Setup


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


/Users/pmlandwehr/anaconda3/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

In [2]:
def get_mbt_corrs(mbts):
    """
    Correlate the presence of each individual MBT letter across a set of types.

    Every entry of ``mbts`` is turned into eight boolean indicators (one per
    letter: does the entry contain that letter?), and the indicators are
    correlated with each other.

    :param list mbts: an iterable of MBT type strings (e.g. ``'INTJ'``),
        one per observation.
    :returns pandas.DataFrame: Pearson correlation of individual letter
        co-occurrences, with rows and columns ordered I, E, S, N, F, T, J, P.
    """
    indiv_mbts = ['I', 'E', 'S', 'N', 'F', 'T', 'J', 'P']
    # Materialise once: the input is scanned once per letter, which would
    # exhaust a generator after the first pass.
    mbts = list(mbts)
    mbt_dict = {val: [val in x for x in mbts] for val in indiv_mbts}
    # .loc (label-based) replaces the removed .ix indexer; it also fixes the
    # row/column order regardless of how the DataFrame ordered the dict keys.
    return pd.DataFrame(mbt_dict).corr().loc[indiv_mbts, indiv_mbts]

Hard-coded type frequencies (percent of population) for the US, taken from the Myers-Briggs site.


In [3]:
# US reference frequencies, in percent.
#   'mbt'   -> (four-letter type, % of US population), most common first.
#              The ('?', 0.0) entry gives the '?' answer a US value of zero.
#   'indiv' -> (single letter, % of US population), grouped by dichotomy.
us_types = dict(
    mbt=[
        ('ISFJ', 13.8),
        ('ESFJ', 12.3),
        ('ISTJ', 11.6),
        ('ISFP', 8.8),
        ('ESTJ', 8.7),
        ('ESFP', 8.5),
        ('ENFP', 8.1),
        ('ISTP', 5.4),
        ('INFP', 4.4),
        ('ESTP', 4.3),
        ('INTP', 3.3),
        ('ENTP', 3.2),
        ('ENFJ', 2.5),
        ('INTJ', 2.1),
        ('ENTJ', 1.8),
        ('INFJ', 1.5),
        ('?', 0.0),
    ],
    indiv=[
        ('I', 50.7),
        ('E', 49.3),
        ('S', 73.3),
        ('N', 26.7),
        ('F', 59.8),
        ('T', 40.2),
        ('J', 54.1),
        ('P', 45.9),
    ],
)

Import and split data.


In [4]:
df = pd.read_csv('data/MeFites & Myers-Briggs Types.csv')
df.columns = ['timestamp', 'MBT']

# Percentages are taken relative to the number of submitters,
# NOT the total number of submitted types.
submission_count = len(df)


def _normalize_submission(raw):
    """Clean one raw survey answer (possibly several ';'-separated types).

    Steps, in order: map the free-text "always something different" answer
    to '?', strip all spaces, then for every ';'-separated entry longer than
    four characters drop its second character.
    """
    text = raw.replace('“It\'s always something different"', '?')
    text = text.replace(' ', '')
    cleaned = []
    for entry in text.split(';'):
        if len(entry) > 4:
            # NOTE(review): presumably collapses answers carrying one extra
            # character after the first letter - confirm against the raw data.
            entry = entry[0] + entry[2:]
        cleaned.append(entry)
    return ';'.join(cleaned)


df['MBT'] = df['MBT'].apply(_normalize_submission)

# all_vals: _all_ of the individual types
# (multi-entry submissions split)
types_per_submission = df['MBT'].apply(lambda answer: answer.split(';'))
all_vals = pd.Series(np.concatenate(types_per_submission))

Making Pictures

Bar Plots for Specific MBTs, MeFi vs. US


In [5]:
# a: one row per submitted type with its count, its share of MeFi
# submitters, and the matching US share.
a = all_vals.value_counts().to_frame().reset_index()
a.columns = ['MBT', 'Count']
# Denominator is the number of submitters, not the number of submitted types.
a['% MeFi'] = 100. * a['Count'] / submission_count

us_type_pct = pd.DataFrame.from_records(us_types['mbt'], columns=['MBT', '% US'])
a = a.merge(us_type_pct, on=['MBT'])

# b: long-form version of a (one row per type and domain) for plotting.
b = pd.melt(a, id_vars=['MBT'], value_vars=['% US', '% MeFi'])
b.columns = ['MBT', 'Domain', 'Percentage']
# Drop the leading '% ' so the domain labels read 'US' / 'MeFi'.
b['Domain'] = b['Domain'].map(lambda label: label[2:])

In [6]:
sns.set_context('talk', font_scale=0.75)
# Order the bars by descending domain, then percentage, then type name.
# NOTE(review): newer seaborn releases renamed factorplot/size to
# catplot/height; kept as-is for the seaborn version this notebook ran on.
b_sorted = b.sort_values(['Domain', 'Percentage', 'MBT'], ascending=False)
sns.factorplot(x='MBT', y='Percentage', hue='Domain',
               data=b_sorted, kind='bar', size=8)


/Users/pmlandwehr/anaconda3/lib/python3.5/site-packages/matplotlib/__init__.py:892: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
Out[6]:
<seaborn.axisgrid.FacetGrid at 0x1104ddb38>

In [7]:
# Show the per-type table with the columns in reading order.
# .loc replaces the deprecated (and since removed) .ix indexer; the
# selection is purely by column label, so the result is unchanged.
a.loc[:, ['MBT', '% US', '% MeFi', 'Count']]


Out[7]:
MBT % US % MeFi Count
0 INTJ 2.1 26.595745 150
1 INTP 3.3 18.971631 107
2 INFP 4.4 16.489362 93
3 INFJ 1.5 13.297872 75
4 ENFP 8.1 6.560284 37
5 ? 0.0 6.028369 34
6 ENTP 3.2 4.609929 26
7 ISFJ 13.8 4.609929 26
8 ENFJ 2.5 4.255319 24
9 ENTJ 1.8 4.078014 23
10 ISTJ 11.6 3.723404 21
11 ISTP 5.4 1.950355 11
12 ESFJ 12.3 1.241135 7
13 ESTJ 8.7 1.063830 6
14 ISFP 8.8 1.063830 6
15 ESFP 8.5 0.531915 3
16 ESTP 4.3 0.354610 2

Bar Plots for Individual MBT Components, MeFi vs. US


In [8]:
# all_vals_two: every individual letter from every submitted type
# (';' separators and '?' answers are removed before splitting into letters).
letters_per_submission = df['MBT'].apply(
    lambda answer: list(answer.replace(';', '').replace('?', '')))
all_vals_two = pd.Series(np.concatenate(letters_per_submission))

# c: one row per letter with its count, MeFi share and US share.
c = all_vals_two.value_counts().to_frame().reset_index()
c.columns = ['MBT', 'Count']
# Letter total / 4 is used as the number of types counted
# (assumes every remaining entry is a four-letter type - TODO confirm).
type_total = all_vals_two.shape[0] / 4
c['% MeFi'] = 100. * c['Count'] / type_total

us_letter_pct = pd.DataFrame.from_records(us_types['indiv'], columns=['MBT', '% US'])
c = c.merge(us_letter_pct, on=['MBT'])

# d: long-form version of c (one row per letter and domain) for plotting.
d = pd.melt(c, id_vars=['MBT'], value_vars=['% US', '% MeFi'])
d.columns = ['MBT', 'Domain', 'Percentage']
# Drop the leading '% ' so the domain labels read 'US' / 'MeFi'.
d['Domain'] = d['Domain'].map(lambda label: label[2:])

In [9]:
sns.set_context('talk', font_scale=1.)
# 2x2 grid, one panel per dichotomy (I/E, S/N, F/T, J/P), shared y axis.
f, axes = plt.subplots(2, 2, figsize=(8, 8), sharey=True)
for ax, order in zip(axes.flat, [('I', 'E'), ('S', 'N'), ('F', 'T'), ('J', 'P')]):
    # Use the axes-level barplot: the figure-level factorplot builds its own
    # FacetGrid figure on every call, which left stray empty figures behind
    # and meant despine() was applied to that grid instead of this panel.
    sns.barplot(x='MBT', y='Percentage', hue='Domain',
                data=d, order=order, ax=ax)
    sns.despine(ax=ax, left=True)
    ax.set(ylim=(0, 90.))
    ax.set_xlabel('')
# Keep a populated legend only on the top-right panel.
for i in [0, 2, 3]:
    f.axes[i].legend([])
# Label the y axis on the left-hand column only.
for i in [1, 3]:
    f.axes[i].set_ylabel('')
for i in [0, 2]:
    f.axes[i].set_ylabel('Percentage')


/Users/pmlandwehr/anaconda3/lib/python3.5/site-packages/matplotlib/__init__.py:892: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

In [10]:
# Show the per-letter table with the rows grouped by dichotomy pair.
# The row labels are the integer index left by value_counts()/merge when
# `c` was built, so this ordering is tied to the current counts.
# .loc replaces the deprecated (and since removed) .ix indexer; on this
# integer index both select by label.
c.loc[[1, 6, 7, 0, 4, 3, 2, 5], ['MBT', '% US', '% MeFi', 'Count']]


Out[10]:
MBT % US % MeFi Count
1 I 50.7 79.254457 489
6 E 49.3 20.745543 128
7 S 73.3 13.290113 82
0 N 26.7 86.709887 535
4 P 45.9 46.191248 285
3 J 54.1 53.808752 332
2 T 40.2 56.077796 346
5 F 59.8 43.922204 271

Correlation plot for general MeFi categories


In [11]:
# all_vals_three: all specific values that aren't "?"
# Boolean mask through .loc; .ix is deprecated and removed from current pandas.
all_vals_three = all_vals.loc[all_vals != '?']

# Letter-by-letter correlation across the MeFi submissions.
corrs = get_mbt_corrs(all_vals_three)

In [12]:
# Heatmap of the MeFi letter correlations, with the colour scale pinned to
# a symmetric [-0.4, 0.4] range and each cell annotated with its value.
sns.heatmap(data=corrs,
            vmin=-0.4,
            vmax=0.4,
            annot=True)


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1104cfda0>

Correlation plot for general US categories.


In [13]:
# Build a synthetic US sample: each type is repeated in proportion to its
# percentage, scaled by 10 so one decimal place becomes a whole count.
# np.repeat needs integer counts: current NumPy refuses float repeats and
# older releases truncated them, so round explicitly to avoid losing an
# observation when a float product lands just below a whole number.
us_labels = [label for label, _ in us_types['mbt']]
us_counts = [int(round(pct * 10)) for _, pct in us_types['mbt']]
all_vals_four = np.repeat(us_labels, us_counts)
corrs_two = get_mbt_corrs(all_vals_four)

In [14]:
# Heatmap of the US letter correlations, with the colour scale pinned to
# a symmetric [-0.4, 0.4] range and each cell annotated with its value.
sns.heatmap(data=corrs_two,
            vmin=-0.4,
            vmax=0.4,
            annot=True)


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x1116791d0>